import os
import pandas as pd
import csv
import numpy as np
import math
from matplotlib.lines import Line2D
import matplotlib as mpl
from matplotlib.patches import Patch
import matplotlib.pyplot as plt
import random
from scipy.spatial import distance
from scipy import stats
from sklearn import manifold
from sklearn.decomposition import PCA
import statsmodels.stats.multitest as sm

Summary

This file includes the results of a microbiome analysis performed on samples taken from four individuals that were originally used to determine the “Impact of DNA source on genetic variant detection from human whole-genome sequencing data”.

This included blood, saliva and buccal samples taken from four individuals (blood samples were taken at a different time than saliva and buccal samples). Additionally, a methylation-based enrichment for eukaryotic DNA was performed on the saliva and buccal samples.

Basic processing steps

Obtaining reads

Fastq.gz files were downloaded from the ENA database, project accession number PRJNA523344

Kneaddata

Kneaddata was used for quality control and removal of human sequences. This included:
- Trimmomatic 0.39: “SLIDINGWINDOW:4:20 MINLEN:50”
- Bowtie2 with the GRCh38_PhiX database (to remove human and PhiX reads): “–fast –dovetail”

Reads kept after Kneaddata

#3 
#set up colors function (to get up to 120 colors, but with up to 40 unique colors)
def get_cols(num):
    colormap_20, colormap_40b, colormap_40c = mpl.cm.get_cmap('tab20', 256), mpl.cm.get_cmap('tab20b', 256), mpl.cm.get_cmap('tab20c', 256)
    norm, norm2 = mpl.colors.Normalize(vmin=0, vmax=19), mpl.colors.Normalize(vmin=20, vmax=39)
    m1, m2, m3 = mpl.cm.ScalarMappable(norm=norm, cmap=colormap_20), mpl.cm.ScalarMappable(norm=norm, cmap=colormap_40b), mpl.cm.ScalarMappable(norm=norm2, cmap=colormap_40c)
    colors_20 = [m1.to_rgba(a) for a in range(20)]
    colors_40 = [m2.to_rgba(a) for a in range(20)]+[m3.to_rgba(a) for a in range(20,40)]
    if num < 21: return colors_20
    elif num < 41: return colors_40
    else: return colors_40+colors_40+colors_40
#and colors and shapes for different participants and body sites
colors_dict, shapes_dict = {'Blood':'#900C3F', 'Saliva':'#016F85', 'Buccal':'#ff8300', 'Saliva_euk':'#02aed1', 'Buccal_euk':'#FFC300'}, {'Huref':'o', 'PGPC-0002':'^', 'PGPC-0005':'*', 'PGPC-0006':'s', 'PGPC-0050':'p'}
#4
#get numbers of reads for different steps
reads = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/read_counts.txt', sep='\t', index_col=3, header=0)
participant_dict, site_dict, full_name_dict = {}, {}, {}
samples = list(reads.index.values)
for s in samples:
    participant_dict[s] = reads.loc[s, 'Participant']
    site_dict[s] = reads.loc[s, 'Body site']
    full_name_dict[s] = reads.loc[s, 'Participant']+' '+reads.loc[s, 'Body site']
total_reads = pd.DataFrame(reads.loc[:, 'cat_reads'])
sample_names = [participant_dict[name]+' '+site_dict[name] for name in samples]
colors = [colors_dict[s] for s in list(reads.loc[:, 'Body site'].values)]
shapes = [shapes_dict[s] for s in list(reads.loc[:, 'Participant'].values)]

Percentage reads remaining

#5
plt.figure(figsize=(10, 5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
plt.sca(ax1)
plt.bar(list(reads.index.values), reads.loc[:, 'Percentage'].values, color=colors, edgecolor='k')
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Reads kept (%)')
plt.xlim([-0.5,20.5])
plt.sca(ax2)
plt.bar(list(reads.index.values), reads.loc[:, 'Percentage'].values, color=colors, edgecolor='k')
plt.semilogy()
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Log reads kept (%)')
handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
ax2.legend(handles=handles, bbox_to_anchor=(1.4,1.05))
plt.xlim([-0.5,20.5])
plt.tight_layout()
plt.show()

Number of reads remaining

#6
plt.figure(figsize=(10, 5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
plt.sca(ax1)
plt.bar(list(reads.index.values), reads.loc[:, 'cat_reads'].values, color=colors, edgecolor='k')
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.xlim([-0.5,20.5])
plt.ylabel('Reads remaining')

plt.sca(ax2)
plt.bar(list(reads.index.values), reads.loc[:, 'cat_reads'].values, color=colors, edgecolor='k')
plt.semilogy()
plt.xticks(list(reads.index.values), sample_names, rotation=90)
plt.ylabel('Log reads remaining')
handles = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
ax2.legend(handles=handles, bbox_to_anchor=(1.4,1.05))
plt.xlim([-0.5,20.5])
plt.tight_layout()
plt.show()

Table of reads remaining

#7
reads_remain = reads.loc[:, ['Percentage', 'cat_reads']].rename(index=full_name_dict, columns = {'cat_reads':'Number'})
#8
py$reads_remain %>%
  kable() %>%
  kable_styling()
Percentage Number
Huref Blood 0.2319067 1087311
PGPC-0002 Blood 0.5748318 3321664
PGPC-0005 Blood 0.5149286 1903482
PGPC-0006 Blood 0.7146067 3593441
PGPC-0050 Blood 0.6870164 2940390
PGPC-0002 Saliva 3.9289510 16561343
PGPC-0005 Saliva 55.6458826 244058333
PGPC-0006 Saliva 13.1608121 60497393
PGPC-0050 Saliva 56.3700720 257049004
PGPC-0002 Buccal 2.8112226 11462781
PGPC-0005 Buccal 2.8202504 12751614
PGPC-0006 Buccal 5.1521560 22434503
PGPC-0050 Buccal 2.4297107 10667011
PGPC-0002 Saliva_euk 1.2333408 5345888
PGPC-0005 Saliva_euk 8.3104464 38138382
PGPC-0006 Saliva_euk 1.9067564 8786635
PGPC-0050 Saliva_euk 5.6187272 25037552
PGPC-0002 Buccal_euk 0.9385856 4380036
PGPC-0005 Buccal_euk 1.2119820 5584415
PGPC-0006 Buccal_euk 1.5567291 7232280
PGPC-0050 Buccal_euk 1.3836141 6382288

Taxonomic profiling

The taxonomy has been profiled using:
1. HUMAnN2
- MetaPhlAn2
2. Kraken2 with Bracken
- GTDB (no confidence parameter set) -
using the database constructed using Struo, release 89 - GTDB (confidence = 0.1)
- Minikraken v1 (no human genome, no confidence parameter set)
- Minikraken v1 (no human genome, confidence = 0.1)
- Minikraken v2 (with human genome, no confidence parameter set)
- Minikraken v2 (with human genome, confidence = 0.1)
- RefSeq Complete v93 (no confidence parameter set)
- RefSeq Complete v93 (confidence = 0.1)

MetaPhlAn2 taxonomy nMDS plots

#9
#get the taxonomy file and sort it to strain and genus level
taxa = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/metaphlan_humann/processing/humann2_final_out_90/metaphlan_merged.tsv', sep='\t', header=0, index_col=0)
tax_names = list(taxa.index.values)
keeping = []
for a in range(len(tax_names)):
    if 't__' in tax_names[a]:
        keeping.append(True)
    elif 'unclassified' in tax_names[a]:
        keeping.append(True)
    else:
        keeping.append(False)
strain = taxa.loc[keeping, :]
strain_names = list(strain.index.values)
strain_dict = {}
for i in range(len(strain_names)):
    strain_dict[strain_names[i]] = strain_names[i].split('|s__')[0].split('|g__')[1]
genus = strain.rename(index=strain_dict)
genus = genus.groupby(by=genus.index, axis=0).sum()
#10
#define the function that calculates the nmds plots

def transform_for_NMDS(df, dist_met='braycurtis'):
    X = df.iloc[0:].values
    y = df.iloc[:,0].values
    seed = np.random.RandomState(seed=3)
    X_true = X
    similarities = distance.cdist(X_true, X_true, dist_met)
    mds = manifold.MDS(n_components=2, max_iter=3000, eps=1e-9, random_state=seed,
                   dissimilarity="precomputed", n_jobs=1)
    #print(similarities)
    pos = mds.fit(similarities).embedding_
    nmds = manifold.MDS(n_components=2, metric=False, max_iter=3000, eps=1e-12,
                        dissimilarity="precomputed", random_state=seed, n_jobs=1,
                        n_init=1)
    npos = nmds.fit_transform(similarities, init=pos)
    # Rescale the data
    pos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((pos ** 2).sum())
    npos *= np.sqrt((X_true ** 2).sum()) / np.sqrt((npos ** 2).sum())
    # Rotate the data
    clf = PCA()
    X_true = clf.fit_transform(X_true)
    pos = clf.fit_transform(pos)
    npos = clf.fit_transform(npos)
    return pos, npos, nmds.stress_
strain_t = strain.transpose()
genus_t = genus.transpose()

handles1 = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
handles2 = [Line2D([0], [0], marker=shapes_dict[shape], color='w', label=shape, markerfacecolor='k', markersize=15) for shape in shapes_dict]

Bray-Curtis distance

#11
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'braycurtis')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'braycurtis')

plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
    ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
    ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a],  color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')

ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1,1))
plt.tight_layout()
plt.show()

Euclidean distance

#12
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'euclidean')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'euclidean')

plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
    ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
    ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a],  color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')

ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1.4,1))
plt.tight_layout()
plt.show()

Jaccard distance

#13
strain_pos, strain_npos, strain_stress = transform_for_NMDS(strain_t, 'jaccard')
genus_pos, genus_npos, genus_stress = transform_for_NMDS(genus_t, 'jaccard')

plt.figure(figsize=(10,5))
ax1, ax2 = plt.subplot(121), plt.subplot(122)
for a in range(len(strain_npos)):
    ax1.scatter(strain_npos[a,0], strain_npos[a,1], marker=shapes[a], color=colors[a], s=100)
    ax2.scatter(genus_npos[a,0], genus_npos[a,1], marker=shapes[a],  color=colors[a], s=100)
ax1.set_xlabel('nMDS1')
ax2.set_xlabel('nMDS1')
ax1.set_ylabel('nMDS2')
ax1.set_title('Strain')
ax2.set_title('Genus')

ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1.4,1))
plt.tight_layout()
plt.show()

MetaPhlAn2 taxonomy relative abundance

Kingdom

Here the relative abundance of taxa calulated by MetaPhlAn2 are plotted at the Kingdom level for each sample.

#14
plt.figure(figsize=(7,5))
ax1 = plt.subplot(111)
plt.bar(list(taxa.columns.values), taxa.loc['k__Viruses', :].values, color='#C70039', edgecolor='k')
plt.bar(list(taxa.columns.values), taxa.loc['k__Bacteria', :].values, bottom=taxa.loc['k__Viruses', :].values, color='#026B81', edgecolor='k')
#plt.xticks(list(taxa.columns.values), sample_names, rotation=90)
empty = []
for x in range(0,21):
    empty.append('')
    ax1.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
plt.xticks(range(0, 21), empty)
plt.xlim([-0.5,20.5])
handles = [Patch(facecolor='#C70039', edgecolor='k', label='Viruses'), Patch(facecolor='#026B81', edgecolor='k', label='Bacteria')]
plt.legend(handles=handles, bbox_to_anchor=(1,1.05))
plt.tight_layout()
plt.show()

Genus

Here the relative abundance of taxa calulated by MetaPhlAn2 are plotted at the Genus level for each sample. Genera with below 1% maximum relative abundance have been removed.

#15
genus = genus[genus.max(axis=1) > 1]
genera = list(genus.index.values)
plt.figure(figsize=(10,5))
ax1 = plt.subplot(111)
gen_colors = get_cols(len(genus.index.values))
handles = []
for g in range(len(genera)):
    this_gen = genus.loc[genera[g], :].values
    if g == 0:
        ax1.bar(list(genus.columns.values), this_gen, color=gen_colors[g], edgecolor='k')
        total = this_gen
    else:
        ax1.bar(list(genus.columns.values), this_gen, bottom=total, color=gen_colors[g], edgecolor='k')
        total = this_gen+total
    handles.append(Patch(facecolor=gen_colors[g], edgecolor='k', label=genera[g]))
empty = []
for x in range(0,21):
    empty.append('')
    ax1.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
plt.xticks(range(0, 21), empty)
plt.xlim([-0.5,20.5])
plt.legend(handles=handles, bbox_to_anchor=(1,1.05), ncol=2)
plt.tight_layout()
plt.show()

Kraken2 reads classified

#16
#get all samples into dataframes based on the database that they use
folders = sorted(os.listdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2'))
del folders[0]
kraken_columns = {0:'Percent fragments clade', 1:'Number fragments clade', 2:'Number fragments taxon', 3:'Level', 4:'NCBI ID', 5:'Taxon name'}
kraken_all_db, bracken_all_db, all_domains = [], [], {}
for fol in folders:
    if fol == 'db_genera':
      continue
    if not os.path.isdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol):
        continue
    bracken, kraken, bracken_kreport = [], [], []
    bracken_pd, kraken_pd = [], []
    for fi in sorted(os.listdir('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol)):
        if fi[-7:] == 'bracken':
            bracken.append(fi)
        elif fi[-7:] == 'kreport' and 'bracken' not in fi:
            kraken.append(fi)
        elif fi[-7:] == 'kreport':
            bracken_kreport.append(fi)
    for bk in bracken_kreport:
        with open('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+bk, 'rU') as f:
            bk = []
            domains = {}
            this_domain, domain_name = [], ''
            for row in csv.reader(f, delimiter='\t'):
                bk.append(row)
                row[5] = row[5].lstrip()
                if row[3] == 'D':
                    if domain_name != '':
                        domains[domain_name] = this_domain
                    this_domain, domain_name = [], row[5]
                else:
                    if row[3] != 'R' and row[3] != 'U' and 'D' not in row[3]:
                        this_domain.append(row[5])
            domains[domain_name] = this_domain
            for domain in domains:
                if domain in all_domains:
                    all_domains[domain] = list(set(all_domains[domain]+domains[domain]))
                else:
                    all_domains[domain] = list(set(domains[domain]))
    for b in bracken:
        if len(b) > 22:
            continue
        sample = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+b, sep='\t', header=0, index_col=0)
        b = b.replace('_150', '')
        sample.drop(['taxonomy_id', 'taxonomy_lvl', 'kraken_assigned_reads', 'added_reads', 'fraction_total_reads'], axis=1, inplace=True)
        sample.rename(columns={'new_est_reads':b[:-8]}, inplace=True)
        bracken_pd.append(sample)
    for k in kraken:
        sample = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/'+fol+'/'+k, sep='\t', header=None, index_col=3)
        sample = sample.loc[['U', 'D'], :]
        sample = sample.rename(columns=kraken_columns).drop(['Number fragments taxon', 'NCBI ID'], axis=1).rename(columns={'Percent fragments clade':k[:-8]+'_percent', 'Number fragments clade':k[:-8]+'_reads'}).set_index('Taxon name')
        taxa = list(sample.index.values)
        taxa_dict = {}
        for t in taxa:
            taxa_dict[t] = t.replace(' ', '')
        sample = sample.rename(index=taxa_dict)
        kraken_pd.append(sample)
    bracken = pd.concat(bracken_pd, join='outer')
    kraken = pd.concat(kraken_pd, join='outer')
    kraken = kraken.rename(index={'d__Bacteria':'Bacteria', 'd__Archaea':'Archaea'})
    kraken = kraken.groupby(by=kraken.index, axis=0).sum()
    bracken = bracken.groupby(by=bracken.index, axis=0).sum().fillna(value=0)
    kraken_all_db.append(kraken), bracken_all_db.append(bracken)
#17
x1 = [x for x in range(21)]
x2 = [x+0.3 for x in range(21)]
tax_plotting = ['Archaea', 'Bacteria', 'Eukaryota', 'Viruses', 'unclassified']
color_plotting = ['#EDBB99', '#5499C7', '#7DCEA0', '#F7DC6F', '#CCD1D1']
tax_paper = ['Bacteria', 'Eukaryota', 'Other', 'Unclassified']
color_paper = ['#5499C7', '#7DCEA0', '#CD6155', '#CCD1D1']
from_paper = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/from_paper.csv', header=0, index_col=0)

def get_summary_reads(kraken_db):
    fig = plt.figure(figsize=(15,15))
    ax1, ax2, ax3, ax4, ax5 = plt.subplot(321), plt.subplot(322), plt.subplot(323), plt.subplot(324), plt.subplot(325)
    ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
    ax5.set_title('From paper')
    ax_plot = [ax3, ax3, ax1, ax1, ax2, ax2, ax4, ax4]
    x_plot = [x1, x2, x1, x2, x1, x2, x1, x2]
    axs = [ax1, ax2, ax3, ax4, ax5]
    for s in range(len(samples)):
        if s == 0:
            continue
        bottom = 0
        for t in range(len(tax_paper)):
            ax5.bar(x1[s], from_paper.loc[tax_paper[t], samples[s]], bottom=bottom, color=color_paper[t], edgecolor='k', width=0.6)
            bottom += from_paper.loc[tax_paper[t], samples[s]]
    for db in range(len(kraken_db)):
        ax_using = ax_plot[db]
        x = x_plot[db]
        db = kraken_db[db]
        handles = []
        for tax in range(len(tax_plotting)):
            handles.append(Patch(facecolor=color_plotting[tax], edgecolor='k', label=tax_plotting[tax]))
            tax = tax_plotting[tax]
            if tax not in list(db.index.values):
                db.loc[tax] = [0 for i in range(db.shape[1])]
        handles.append(Patch(facecolor=color_paper[2], edgecolor='k', label='Other'))
        db = db.fillna(value=0)
        for s in range(len(samples)):
            bottom = 0
            for t in range(len(tax_plotting)):
                prop = db.loc[tax_plotting[t], samples[s]+'_reads']
                cat = total_reads.loc[samples[s], 'cat_reads']
                prop = (prop/cat)*100
                ax_using.bar(x[s], prop, bottom=bottom, color=color_plotting[t], edgecolor='k', width=0.3)
                bottom += prop
    ax2.legend(handles=handles, bbox_to_anchor=(1,1.05))
    for ax in axs:
        plt.sca(ax)
        plt.xticks(x1, ['' for x in x1])
        plt.ylim([0, 100])
        plt.xlim([-0.5, 20.5])
    for x in x1:
        ax5.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
        ax4.text(x, -2, sample_names[x], color=colors[x], rotation=90, va='top', ha='center')
    ax1.set_ylabel('Classified (%)'), ax3.set_ylabel('Classified (%)'), ax5.set_ylabel('Classified(%)')
    #plt.tight_layout()
    return

def get_summary_bacteria(kraken_db):
    tax_plotting = ['Bacteria']
    alpha = ['#5499C7', '#F1C40F', '#5499C7', '#F1C40F', '#5499C7', '#F1C40F', '#5499C7', '#F1C40F']
    
    fig = plt.figure(figsize=(10,8))
    ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
    ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
    ax_plot = [ax3, ax3, ax1, ax1, ax2, ax2, ax4, ax4]
    x_plot = [x1, x2, x1, x2, x1, x2, x1, x2]
    axs = [ax1, ax2, ax3, ax4]
    
    for db in range(len(kraken_db)):
        ax_using = ax_plot[db]
        x = x_plot[db]
        alp = alpha[db]
        db = kraken_db[db]
        for tax in range(len(tax_plotting)):
            tax = tax_plotting[tax]
            if tax not in list(db.index.values):
                db.loc[tax] = [0 for i in range(db.shape[1])]
        db = db.fillna(value=0)
        for s in range(len(samples)):
            bottom = 0
            for t in range(len(tax_plotting)):
                prop = db.loc[tax_plotting[t], samples[s]+'_reads']
                cat = total_reads.loc[samples[s], 'cat_reads']
                ax_using.bar(x[s], prop, bottom=bottom, color=alp, edgecolor='k', width=0.3)
                bottom += prop
    handles = []
    handles.append(Patch(facecolor=alpha[0], edgecolor='k', label='No confidence value'))
    handles.append(Patch(facecolor=alpha[1], edgecolor='k', label='Confidence=0.1'))
    ax2.legend(handles=handles, bbox_to_anchor=(1.6,1.03))
    for ax in axs:
        plt.sca(ax)
        plt.xticks(x1, ['' for x in x1])
        plt.semilogy()
        plt.xlim([-0.5, 20.5])
        #plt.ylim([0, 100])
        plt.xlim([-0.5, 20.5])
    for x in x1:
        pl = ((1/21)*(x+1))-0.02
        ax3.text(pl, -0.03, sample_names[x], color=colors[x], rotation=90, va='top', ha='center', transform=ax3.transAxes)
        ax4.text(pl, -0.03, sample_names[x], color=colors[x], rotation=90, va='top', ha='center', transform=ax4.transAxes)
    ax1.set_ylabel('Number of reads'), ax3.set_ylabel('Number of reads')
    #plt.tight_layout()
    return

Percent classified

A summary of the percentage of reads classified as different domains with different databases. Note that the ‘From paper’ plot uses the classifications given in the original paper, where 10,000 unmapped reads were classified using BLAST searches of the NCBI database.

#18
get_summary_reads(kraken_all_db)
plt.show()

Summary of number of bacteria

Summary of the number of reads that are classified as bacteria by each database.

#19
get_summary_bacteria(kraken_all_db)
plt.tight_layout()
plt.show()

Kraken2 taxonomy nMDS plots

These first plots are all separately with the confidence parameter set. See the last tab for those without the confidence parameter set.

#20
db_names = ['gtdb', 'gtdb_conf', 'minikraken', 'minikraken_conf', 'minikraken_human', 'minikraken_human_conf', 'refseq', 'refseq_conf']
bacteria = all_domains['Bacteria']+all_domains['d__Bacteria']
genera, gen_names, genera_1, gen_names_1, strain, gen_sums = [], [], [], [], [], []

for db in range(len(bracken_all_db)):
    d = int(db)
    db = bracken_all_db[db]
    species = list(db.index.values)
    keeping = []
    species_dict = {}
    for sp in species:
        if sp in bacteria:
            keeping.append(True)
            new_sp = sp.split('__')
            if len(new_sp) > 1:
                new_sp = new_sp[1]
            else:
                new_sp = new_sp[0]
            species_dict[sp] = new_sp.split(' ')[0].replace("'", '')
        else:
            keeping.append(False)
    in_len = db.shape[0]
    db = db.loc[keeping, :]
    strain.append(db)
    db = db.rename(index=species_dict)
    db = db.groupby(by=db.index, axis=0).sum()
    sums = db.sum(axis=0)
    gen_sums.append(sums)
    db = db.divide(sums, axis=1).multiply(100)
    genera.append(db)
    db.to_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/'+db_names[d]+'.csv')
    gen_names = gen_names+list(db.index.values)
    db = db[db.max(axis=1) > 1]
    genera_1.append(db)
    gen_names_1 = gen_names_1+list(db.index.values)
gen_names = list(set(gen_names))
gen_names_1 = list(set(gen_names_1))
#21
def plot_four_nmds(dbs, metric, name):
    fig = plt.figure(figsize=(15,10))
    #fig.suptitle(name+metric+'\n\n\n')
    ax1, ax2, ax3, ax4 = plt.subplot(221), plt.subplot(222), plt.subplot(223), plt.subplot(224)
    axs = [ax3, ax1, ax2, ax4]
    ax1.set_title('Minikraken V1\n(without human genome)'), ax2.set_title('Minikraken V2\n(with human genome)'), ax3.set_title('GTDB'), ax4.set_title('RefSeq Complete V93')
    
    for db in range(len(dbs)):
        n = db
        db = dbs[db].transpose()
        pos, npos, stress = transform_for_NMDS(db, metric)
        for a in range(len(npos)):
            axs[n].scatter(npos[a,0], npos[a,1], marker=shapes[a], color=colors[a], s=100, edgecolor='k')
        axs[n].set_xlabel('nMDS1')
        axs[n].set_ylabel('nMDS2')
    handles1 = [Patch(facecolor=colors_dict[color], edgecolor='k', label=color) for color in colors_dict]
    handles2 = [Line2D([0], [0], marker=shapes_dict[shape], color='w', label=shape, markerfacecolor='k', markersize=15) for shape in shapes_dict]

    ax2.legend(handles=handles1+handles2, bbox_to_anchor=(1,1))
    plt.tight_layout()
    return

Bray-Curtis distance strain level

#22
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'braycurtis', 'NMDS confidence=0.1 strain ')
plt.show()

Bray-Curtis distance genus level

#23
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'braycurtis', 'NMDS confidence=0.1 genera ')
plt.show()

Euclidean distance strain level

#24
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'euclidean', 'NMDS confidence=0.1 strain ')
plt.show()

Euclidean distance genus level

#25
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'euclidean', 'NMDS confidence=0.1 genera ')
plt.show()

Jaccard distance strain level

#26
plot_four_nmds([strain[1], strain[3], strain[5], strain[7]], 'jaccard', 'NMDS confidence=0.1 strain ')
plt.show()

Jaccard distance genus level

#27
plot_four_nmds([genera[1], genera[3], genera[5], genera[7]], 'jaccard', 'NMDS confidence=0.1 genera ')
plt.show()

All plots with no confidence parameter set

Bray-curtis distance at strain level

#28
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'braycurtis', 'NMDS no confidence strain ')
plt.show()

Bray-curtis distance at genus level

#29
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'braycurtis', 'NMDS no confidence genera ')
plt.show()

Euclidean distance at strain level

#30
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'euclidean', 'NMDS no confidence strain ')
plt.show()

Euclidean distance at genus level

#31
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'euclidean', 'NMDS no confidence genera ')
plt.show()

Jaccard distance at strain level

#32
plot_four_nmds([strain[0], strain[2], strain[4], strain[6]], 'jaccard', 'NMDS no confidence strain ')
plt.show()

Jaccard distance at genus level

#33
plot_four_nmds([genera[0], genera[2], genera[4], genera[6]], 'jaccard', 'NMDS no confidence genera ')
plt.show()

Kraken2 taxonomy relative abundance

These plots are now only calculated for the classifications that used confidence = 0.1. Genera with below 1% maximum relative abundance are removed and the numbers in brackets are the number of reads that were classified as bacteria.

#34
db_names = ['gtdb', 'gtdb_conf', 'minikraken', 'minikraken_conf', 'minikraken_human', 'minikraken_human_conf', 'refseq', 'refseq_conf']
#bacteria = all_domains['Bacteria']+all_domains['d__Bacteria']
#genera, gen_names, genera_1, gen_names_1, strain, gen_sums = [], [], [], [], [], []
gen_names_1 = sorted(gen_names_1)
def plot_genera(db, sums, tax_cols, gen_names_1, dname):
    plt.figure(figsize=(10,5))
    ax1 = plt.subplot(111)
    bottom = [0 for x in range(len(db.columns))]
    handles = []
    for g in range(len(gen_names_1)):
        if gen_names_1[g] in db.index.values:
            this_row = db.loc[gen_names_1[g], :].values
            ax1.bar(x1, this_row, bottom=bottom, color=tax_cols[g], edgecolor='k')
            handles.append(Patch(facecolor=tax_cols[g], edgecolor='k', label=gen_names_1[g]))
            for b in range(len(bottom)):
                bottom[b] += this_row[b]
    ax1.legend(handles=handles, bbox_to_anchor=(1, 1.03), ncol=3)
    plt.xticks(x1, ['' for x in x1])
    plt.ylabel('Relative abundance(%)')
    plt.xlim([-0.5, 20.5])
    plt.ylim([0, 100])
    for x in x1:
        n = str(int(sums[samples[x]]))
        ax1.text(x, -2, sample_names[x]+' ('+n+')', color=colors[x], rotation=90, va='top', ha='center')
    plt.tight_layout()
    return

gen_plot = [genera_1[1], genera_1[3], genera_1[5], genera_1[7]]
db_name = ['GTDB', 'Minikraken V1', 'Minikraken V2', 'RefSeq Complete V93']
all_sums = [gen_sums[1], gen_sums[3], gen_sums[5], gen_sums[7]]
tax_cols = get_cols(len(gen_names_1))

Minikraken v1

#35
plot_genera(gen_plot[1], all_sums[1], tax_cols, gen_names_1, db_name[1])
plt.show()

Minikraken v2

#36
plot_genera(gen_plot[2], all_sums[2], tax_cols, gen_names_1, db_name[2])
plt.show()

GTDB

#37
plot_genera(gen_plot[0], all_sums[0], tax_cols, gen_names_1, db_name[0])
plt.show()

RefSeq Complete v93

#38
plot_genera(gen_plot[3], all_sums[3], tax_cols, gen_names_1, db_name[3])
plt.show()

Kraken2 similarities and differences between body sites

Here I have carried out ANCOM2 tests for differential abundance of genera between body sites. All genera are then plotted on a heatmap with mean values for each body site and stars to denote significant differences as determined by ANCOM.

While many of these results vary between databases, it is worth noting that some differences are consistent, e.g.:
1. Prevotella are always more abundant in saliva samples
2. Streptococcus are always more abundant in buccal samples
3. Klebsiella (where present) are always more abundant in blood samples

#39
def get_differences(new_genus, db_name):
    new_genus = new_genus.drop(['SRR8595488'], axis=1)
    new_genus = new_genus[new_genus.max(axis=1) > 0.1]
    samples = list(new_genus.columns)
    
    list_comparisons = [['Blood', 'Saliva'], ['Blood', 'Buccal'], ['Saliva', 'Buccal'], ['Saliva', 'Saliva_euk'], ['Buccal', 'Buccal_euk'], ['Blood', 'Saliva_euk'], ['Blood', 'Buccal_euk'], ['Saliva_euk', 'Buccal_euk']]
    comparisons, metadata, comp_len = [], [], []
    for a in range(len(list_comparisons)):
        keeping = []
        this_md = []
        for b in range(len(samples)):
            if site_dict[samples[b]] in list_comparisons[a]:
                keeping.append(True)
                this_md.append([samples[b], site_dict[samples[b]]])
            else:
                keeping.append(False)
        this_comp = new_genus.loc[:, keeping]
        this_comp = this_comp[this_comp.max(axis=1) > 0.1]
        comparisons.append(this_comp)
        this_md = pd.DataFrame(this_md, columns=['Samples', 'Groups'])
        #this_md.set_index('Samples', inplace=True)
        metadata.append(this_md)
        comp_len.append(this_comp.shape[0])
    new_genus = new_genus.rename(columns=site_dict, inplace=False)
    return comparisons, metadata, new_genus, comp_len
db_name = ['GTDB', 'Minikraken V1', 'Minikraken V2', 'RefSeq Complete V93']
comparison_names = [r'Blood vs saliva', r'Blood vs buccal', r'Saliva vs buccal', r'Saliva vs saliva_euk', r'Buccal vs buccal_euk', r'Blood vs saliva_euk', r'Blood vs buccal_euk', r'Saliva_euk vs buccal_euk']
source("/Users/robynwright/Documents/OneDrive/Github/R-notebooks/ancom_v2.1.R")

get_ancom <- function(ft, md) {
all_ancom = list()
for (a in 1:8){
  feature_table = ft[a]
  meta_data = md[a]
  process = feature_table_pre_process(feature_table, meta_data, 'Samples', 'Groups', lib_cut=10, neg_lb=TRUE)
  ancom_out = ANCOM(process$feature_table, process$meta_data, process$struc_zero, main_var='Groups')
  all_ancom[[a]] <- ancom_out$out
}
return(all_ancom)
}
def sort_ancom_results(r_ancom):
  ancom_lists_09, ancom_lists_08, ancom_lists_07, ancom_lists_06 = [], [], [], []
  for a in range(len(r_ancom)):
    this_sig_09, this_sig_08, this_sig_07, this_sig_06 = [], [], [], []
    r_ancom[a].set_index('taxa_id', inplace=True)
    all_sp = list(r_ancom[a].index.values)
    for b in range(len(all_sp)):
      if r_ancom[a].loc[all_sp[b], 'detected_0.9'] == True:
        this_sig_09.append(all_sp[b])
      if r_ancom[a].loc[all_sp[b], 'detected_0.8'] == True:
        this_sig_08.append(all_sp[b])
      if r_ancom[a].loc[all_sp[b], 'detected_0.7'] == True:
        this_sig_07.append(all_sp[b])
      if r_ancom[a].loc[all_sp[b], 'detected_0.6'] == True:
        this_sig_06.append(all_sp[b])
    ancom_lists_09.append(this_sig_09), ancom_lists_08.append(this_sig_08), ancom_lists_07.append(this_sig_07), ancom_lists_06.append(this_sig_06)
  return [ancom_lists_09, ancom_lists_08, ancom_lists_07, ancom_lists_06]
def plot_heatmap(new_genus, ANCOM):
    print(ANCOM)
    names = ['Blood', 'Saliva', 'Buccal', 'Saliva_euk', 'Buccal_euk']
    other_names = ['Abundance', r'Blood $vs$ saliva', r'Blood $vs$ buccal', r'Saliva $vs$ buccal', r'Saliva $vs$ saliva_euk', r'Buccal $vs$ buccal_euk', r'Blood $vs$ saliva_euk', r'Blood $vs$ buccal_euk', r'Saliva_euk $vs$ buccal_euk']
    colormap, norm = mpl.cm.get_cmap('plasma', 256), mpl.colors.Normalize(vmin=0, vmax=1)
    m = mpl.cm.ScalarMappable(norm=norm, cmap=colormap)
    if list(new_genus.index.values)[0][0] == 'A':
        new_genus = new_genus.iloc[::-1]
    figure = plt.figure(figsize=(5,new_genus.shape[0]*0.2))
    ax1 = plt.subplot(111)
    genus_names = list(new_genus.index.values)
    y = []
    for g in range(len(genus_names)):
        this_row = new_genus.loc[genus_names[g], :]
        values = [(np.mean(this_row[name].values)) for name in names]
        bottom, top, x = [g for a in range(5)], [1 for a in range(5)], [a for a in range(5)]
        y.append(g+0.5)
        ma = max(values)
        values = [v/ma for v in values]
        values = [m.to_rgba(v) for v in values]
        ax1.bar(x, top, bottom=bottom, color=values, edgecolor='k', width=1)
        x.append(5)
        ax1.scatter(x[-1], bottom[-1]+0.5, color='k', s=ma*5)
        sig, x_plt = [], x[-1]
        for a in range(len(ANCOM)):
            x_plt += 0.75
            if genus_names[g] in ANCOM[a]: ax1.scatter(x_plt, bottom[-1]+0.5, marker='*', color='k')
            x.append(x_plt)
    plt.ylim([0, len(genus_names)]), plt.xlim([-0.5, x[-1]+0.5])
    plt.yticks(y, genus_names)
    plt.xticks(x, names+other_names, rotation=90)
    ax1.xaxis.set_ticks_position('top')
    plt.tight_layout()
    plt.show()
    return

Minikraken v1 ANCOM

These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.

this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/minikraken_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
    this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
    ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
  kable() %>%
  kable_styling()
Comparison Shared genera ANCOM 0.6 ANCOM 0.7 ANCOM 0.8 ANCOM 0.9
Blood vs saliva 41 41 37 26 8
Blood vs buccal 37 27 21 15 3
Saliva vs buccal 45 3 3 2 1
Saliva vs saliva_euk 41 0 0 0 0
Buccal vs buccal_euk 39 2 0 0 0
Blood vs saliva_euk 38 32 24 15 3
Blood vs buccal_euk 34 11 5 2 2
Saliva_euk vs buccal_euk 43 5 3 2 1

Minikraken v1 heatmap

Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).

plot_heatmap(new_genus, ancom[0])
## [['Burkholderia', 'Enterobacter', 'Klebsiella', 'Paracoccus', 'Pasteurella', 'Prevotella', 'Rothia', 'Staphylococcus'], ['Burkholderia', 'Rothia', 'Streptococcus'], ['Prevotella'], [], [], ['Burkholderia', 'Prevotella', 'Rothia'], ['Rothia', 'Streptococcus'], ['Prevotella']]
## 
## <string>:9: RuntimeWarning: More than 20 figures have been opened. Figures created through the pyplot interface (`matplotlib.pyplot.figure`) are retained until explicitly closed and may consume too much memory. (To control this warning, see the rcParam `figure.max_open_warning`).

Minikraken v2 ANCOM

These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.

this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/minikraken_human_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
    this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
    ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
  kable() %>%
  kable_styling()
Comparison Shared genera ANCOM 0.6 ANCOM 0.7 ANCOM 0.8 ANCOM 0.9
Blood vs saliva 49 48 47 38 9
Blood vs buccal 48 35 16 14 4
Saliva vs buccal 39 5 4 1 0
Saliva vs saliva_euk 30 0 0 0 0
Buccal vs buccal_euk 37 0 0 0 0
Blood vs saliva_euk 49 47 33 22 5
Blood vs buccal_euk 50 19 14 12 4
Saliva_euk vs buccal_euk 42 4 3 3 2

Minikraken v2 heatmap

Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).

plot_heatmap(new_genus, ancom[0])
## [['Capnocytophaga', 'Clostridium', 'Klebsiella', 'Mogibacterium', 'Prevotella', 'Pseudomonas', 'Rothia', 'Streptomyces', 'Veillonella'], ['Klebsiella', 'Rothia', 'Streptococcus', 'Streptomyces'], [], [], [], ['Campylobacter', 'Klebsiella', 'Prevotella', 'Pseudomonas', 'Streptomyces'], ['Klebsiella', 'Rothia', 'Streptococcus', 'Streptomyces'], ['Prevotella', 'Streptococcus']]

GTDB ANCOM

These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.

this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/gtdb_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
    this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
    ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
  kable() %>%
  kable_styling()
Comparison Shared genera ANCOM 0.6 ANCOM 0.7 ANCOM 0.8 ANCOM 0.9
Blood vs saliva 98 88 82 32 6
Blood vs buccal 91 21 14 8 5
Saliva vs buccal 96 18 10 2 1
Saliva vs saliva_euk 93 3 2 0 0
Buccal vs buccal_euk 95 8 5 1 0
Blood vs saliva_euk 92 27 17 11 4
Blood vs buccal_euk 75 7 7 3 2
Saliva_euk vs buccal_euk 97 19 13 7 1

GTDB heatmap

Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).

plot_heatmap(new_genus, ancom[0])
## [['Mycobacterium', 'Pauljensenia', 'Prevotella', 'Psychrobacter', 'Streptococcus', 'Veillonella'], ['Actinomyces', 'Prevotella', 'Rothia', 'Streptococcus', 'Veillonella'], ['Prevotella'], [], [], ['Acinetobacter', 'Prevotella', 'Streptococcus', 'Veillonella'], ['Acinetobacter', 'Poseidonibacter'], ['Prevotella']]

RefSeq Complete v93 ANCOM

These differences are determined by ANCOM2 and the most conservative of these is the ANCOM 0.9 set, which is then used for plotting heatmaps.

this_genera_db = pd.read_csv('/Users/robynwright/Documents/OneDrive/Github/Human_metagenome/kraken2/db_genera/refseq_conf.csv', header=0, index_col=0)
comparisons, md, new_genus, comp_len = get_differences(this_genera_db, db_name[1])
c1, c2, c3, c4, c5, c6, c7, c8 = comparisons[0], comparisons[1], comparisons[2], comparisons[3], comparisons[4], comparisons[5], comparisons[6], comparisons[7]
md1, md2, md3, md4, md5, md6, md7, md8 = md[0], md[1], md[2], md[3], md[4], md[5], md[6], md[7]
ft = list(py$c1, py$c2, py$c3, py$c4, py$c5, py$c6, py$c7, py$c8)
md = list(py$md1, py$md2, py$md3, py$md4, py$md5, py$md6, py$md7, py$md8)
all_ancom = get_ancom(ft, md)
ancom = sort_ancom_results(r.all_ancom)
ancom_df = []
for c in range(len(comparison_names)):
    this_row = [comparison_names[c], comp_len[c], len(ancom[3][c]), len(ancom[2][c]), len(ancom[1][c]), len(ancom[0][c])]
    ancom_df.append(this_row)
ancom_df = pd.DataFrame(ancom_df, columns=['Comparison', 'Shared genera', 'ANCOM 0.6', 'ANCOM 0.7', 'ANCOM 0.8', 'ANCOM 0.9'])
py$ancom_df %>%
  kable() %>%
  kable_styling()
Comparison Shared genera ANCOM 0.6 ANCOM 0.7 ANCOM 0.8 ANCOM 0.9
Blood vs saliva 85 81 77 35 11
Blood vs buccal 82 65 36 21 7
Saliva vs buccal 69 9 6 3 0
Saliva vs saliva_euk 64 0 0 0 0
Buccal vs buccal_euk 76 4 4 1 0
Blood vs saliva_euk 86 72 37 20 4
Blood vs buccal_euk 79 18 15 8 2
Saliva_euk vs buccal_euk 84 12 10 5 1

RefSeq Complete v93 heatmap

Relative abundance is scaled within each genus, with yellow being highest in abundance and purple lowest. Blobs are scaled to maximum relative abundance. Stars denote genera that are significantly differentially abundant between body sites (ANCOM2 0.9).

plot_heatmap(new_genus, ancom[0])
## [['Bordetella', 'Enterococcus', 'Escherichia', 'Klebsiella', 'Mesorhizobium', 'Mycobacterium', 'Prevotella', 'Pseudomonas', 'Salmonella', 'Staphylococcus', 'Veillonella'], ['Actinomyces', 'Mycobacterium', 'Prevotella', 'Pseudomonas', 'Rothia', 'Streptococcus', 'Veillonella'], [], [], [], ['Mycobacterium', 'Prevotella', 'Pseudomonas', 'Veillonella'], ['Actinomyces', 'Veillonella'], ['Prevotella']]